##################### -  Environment - ##################### 
rm(list=objects())
setwd("~/Desktop/Experiments/NetScience/Data")
set.seed(0)

library(missSBM)
library(igraph)
library(softImpute)

##################### -  Data pre-processing - ##################### 
data <- read.graph("netscience.gml", format = "gml")
keep_compo <- which(components(data)$csize > 5) # keep compoqwnents containing at least 5 nodes
vert_ids <- V(data)[components(data)$membership %in% keep_compo]
g <- induced_subgraph(data, vert_ids)

# Adjacency Matrix
A <- as.matrix(as_adjacency_matrix(g))
mean(colSums(A)) # 5
max(colSums(A)) # 34
min(colSums(A)) # 1
N <- dim(A)[1]

### Repeat sampling and prediction
M <- 100
Error_missSBM <- rep(NA, M)
Error_Var<- rep(0, M)
Error_softImpute <- rep(NA, M)

# Save FPR and TPR
Recall_GRID <- (0:100)/100
Precision_VAR <- rep(0, 101)
Precision_missSBM <- rep(0, 101)
Precision_softImpute <- rep(0, 101)

for (m in 1:M){
  # Sample edges
  Omega_undir <- rbinom(n=N*(N-1)/2, size=1, prob=0.5)
  Omega <- matrix(0,N,N)
  Omega[upper.tri(Omega)] <- Omega_undir
  Omega <- (Omega+t(Omega))
  
  # set A_obs the observed adjacency matrix
  A_obs <- A
  diag(A_obs) <- NA
  A_obs[Omega == 0] <- NA
  N <- dim(A)[1]
  
  ##################### -  Predict the unobserved entries - ##################### 
  
  # Using missSBM
  estimator_missSBM <- missSBM::estimateMissSBM(
    adjacencyMatrix = A_obs, 
    vBlocks = 2:10,
    sampling = "dyad",
    control = list(trace = 0))$bestModel$fittedSBM
  estimate_Theta_missSBM <- estimator_missSBM$expectation
  
  # Using the Variational Estimator
  z_est <- estimator_missSBM$memberships
  K <- length(unique(z_est))
  estimate_Q_Var <- sapply(1:K, function(a) sapply(1:K, function(b) mean(A_obs[z_est == a, z_est == b], na.rm =T)))
  estimate_Q_Var[is.na(estimate_Q_Var)] <- 0
  estimate_Theta_Var <- sapply(1:N, function(i) sapply(1:N, function(j) estimate_Q_Var[z_est[i], z_est[j]]))
  diag(estimate_Theta_Var) <- 0
  
  # Using SVD
  SVD <- softImpute::softImpute(A_obs, rank.max = K, lambda = 0)
  estimate_Theta_softImpute <- SVD$u %*% diag(SVD$d, nrow = K, ncol = K) %*% t(SVD$v)
  estimate_Theta_softImpute <- pmin(pmax(estimate_Theta_softImpute, 0),1)
  
  # SE on the unobserved entries
  Omega <- 1 - Omega
  Omega[Omega == 0] <- NA
  
  Error_Var[m] <- sum(Omega*(A - estimate_Theta_Var)**2/sum(Omega*A, na.rm = T), na.rm = T)
  Error_missSBM[m] <- sum(Omega*(A - estimate_Theta_missSBM)**2/sum(Omega*A, na.rm = T), na.rm = T)
  Error_softImpute[m] <- sum(Omega*(A - estimate_Theta_softImpute)**2/sum(Omega*A, na.rm = T), na.rm = T)
  
  # Compute ROC
  Omega[is.na(Omega)] <- 0
  # Variational Estimator
  thresholds <- c(-1, unique(round(sort(estimate_Theta_Var), 2)), 2)
  L <- length(thresholds)
  Recall <- rep(NA, L)
  Precision <- rep(NA, L)
  for (l in 1:L){
    A_VAR <- 1*(estimate_Theta_Var>thresholds[l])
    TP <- sum((A_VAR == 1 & Omega==1 & A == 1))
    FP <- sum((A_VAR == 1 & Omega==1 & A == 0))
    TN <- sum((A_VAR == 0 & Omega==1 & A == 0))
    FN <- sum((A_VAR == 0 & Omega==1 & A == 1))
    Recall[l] <- TP/(TP+FN)
    Precision[l] <- TP/(TP+FP)
  }
  Precision[is.na(Precision)] <- 1 # convention
  Precision_temp <- approx(x = Recall, y = Precision, xout = Recall_GRID, method = "linear")$y
  Precision_VAR <- Precision_VAR + Precision_temp
  
  # missSBM
  thresholds <- c(-1, unique(round(sort(estimate_Theta_missSBM), 2)), 2)
  L <- length(thresholds)
  Recall <- rep(NA, L)
  Precision <- rep(NA, L)
  for (l in 1:L){
    A_missSBM <- 1*(estimate_Theta_missSBM>thresholds[l])
    TP <- sum((A_missSBM == 1 & Omega==1 & A == 1))
    FP <- sum((A_missSBM == 1 & Omega==1 & A == 0))
    TN <- sum((A_missSBM == 0 & Omega==1 & A == 0))
    FN <- sum((A_missSBM == 0 & Omega==1 & A == 1))
    Recall[l] <- TP/(TP+FN)
    Precision[l] <- TP/(TP+FP)
  }
  Precision[is.na(Precision)] <- 1 # convention
  Precision_temp <- approx(x = Recall, y = Precision, xout = Recall_GRID, method = "linear")$y
  Precision_missSBM <- Precision_missSBM + Precision_temp
  
  # softImputeiational Estimator
  thresholds <- c(-1, unique(round(sort(estimate_Theta_softImpute), 2)), 2)
  L <- length(thresholds)
  Recall <- rep(NA, L)
  Precision <- rep(NA, L)
  for (l in 1:L){
    A_softImpute <- 1*(estimate_Theta_softImpute>thresholds[l])
    TP <- sum((A_softImpute == 1 & Omega==1 & A == 1))
    FP <- sum((A_softImpute == 1 & Omega==1 & A == 0))
    TN <- sum((A_softImpute == 0 & Omega==1 & A == 0))
    FN <- sum((A_softImpute == 0 & Omega==1 & A == 1))
    Recall[l] <- TP/(TP+FN)
    Precision[l] <- TP/(TP+FP)
  }
  Precision[is.na(Precision)] <- 1 # convention
  Precision_temp <- approx(x = Recall, y = Precision, xout = Recall_GRID, method = "linear")$y
  Precision_softImpute <- Precision_softImpute + Precision_temp
  print(m)
}
print(paste0("SE of Variational Estimator : ", round(mean(Error_Var),3))) # 0.857
print(paste0("SE of missSBM : ", round(mean(Error_missSBM),3))) # 0.869
print(paste0("SE of SVD : ", round(mean(Error_softImpute),3))) # 0.894

Precision_VAR <- Precision_VAR/M
Precision_missSBM <- Precision_missSBM/M
Precision_softImpute <- Precision_softImpute/M

plot(x = Recall_GRID[-1], y = Precision_VAR[-1], type = 'l', col = 'blue', 
     xlab = "Recall", ylab = "Precision", ylim = c(0,1), xlim = c(0,1))
lines(x = Recall_GRID[-1], y = Precision_missSBM[-1], col = 'red')
lines(x = Recall_GRID[-1], y = Precision_softImpute[-1], col = 'green')
lines(x = Recall_GRID[-1], y = rep(mean(A, na.rm = T), length(Recall_GRID[-1])), col = 'black', lty = 3)

# TPF = faux positifs/négatifs (négatifs = nombreux)
# Precision = vrais positifs/positifs (négatifs = nombreux)
